# Libraries for parsing data
import os
import pandas as pd
import xml.etree.ElementTree as ET
from lxml import etree
from bs4 import BeautifulSoup
import re
import numpy as np
# Root of the project folder; the input and output paths in this script are
# built by appending to it.
path_dropbox = "D:/Dropbox/Research/China Foreign Share Discount"

# Load the conference-call transcript table.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted location.
calls = pd.read_pickle(path_dropbox + '/Conference Call Transcript/transcript.pkl')

# The first two characters of the ISIN are used as the firm's country code.
calls['ctry'] = calls['ISIN'].str[:2]

# Delete rows with a missing country code only. `.str[:2]` produces NaN for a
# missing ISIN, and `isin([None])` may not match NaN, so test for missing
# values with notna() instead.
calls = calls[calls['ctry'].notna()]
# For every call, collect the institution of each listed participant.
# A participant entry is split on the newline character; only entries with
# exactly two lines are used, and the second line (upper-cased) is split on
# '-' into an institution part and, when there is exactly one '-', a position.
institutions_column = []
for participants in calls['participants']:
    call_institutions = []
    for entry in participants:
        entry_lines = entry.split('\n')
        # Entries that are not exactly two lines long are ignored.
        if len(entry_lines) != 2:
            continue
        affiliation = entry_lines[1].upper()
        pieces = affiliation.split('-')
        # Skip participants whose position mentions CEO or CFO; the position
        # is only inspected when the line holds exactly one '-'.
        if len(pieces) == 2 and ('CEO' in pieces[1] or 'CFO' in pieces[1]):
            continue
        # Strip square brackets and surrounding whitespace from the name.
        # NOTE(review): the '&' -> '&' replacement is a no-op as written;
        # confirm whether an HTML entity was meant here (note the text has
        # already been upper-cased at this point).
        name = pieces[0].replace('[', '').replace(']', '').replace('&', '&').strip()
        call_institutions.append(name)
    institutions_column.append(call_institutions)

# One list of institution names per call, aligned with the rows of `calls`.
calls['institutions'] = institutions_column
# Expand the per-call institution lists so each row holds one institution.
calls_explode = calls.explode('institutions')

# Attach the country-code table to each call through the two-letter code
# taken from the ISIN. Paths use forward slashes (as for the transcript file)
# so the literals contain no invalid backslash escapes.
# NOTE(review): read_excel treats the text "NA" as missing by default, which
# would affect a row whose Alpha-2 code is "NA" -- confirm the sheet is read
# as intended.
ctry_code = pd.read_excel(path_dropbox + '/Conference Call Transcript/country code.xlsx')
calls_explode = calls_explode.merge(ctry_code, left_on='ctry', right_on='Alpha-2 code')

# Institution -> country lookup, assembled from two workbooks. The second one
# names its country columns with a space, so align them before stacking.
institution_ctry_nonus = pd.read_excel(
    path_dropbox + '/Conference Call Transcript/institution_nonus.xlsx',
    sheet_name='nonus',
)
institution_ctry_remain = pd.read_excel(
    path_dropbox + '/Conference Call Transcript/institution headquarters_remain2.xlsx',
    sheet_name='Institution',
)
institution_ctry_remain.rename(columns={"Country 1": "Country1", "Country 2": "Country2"}, inplace=True)
institution_ctry = pd.concat([institution_ctry_nonus, institution_ctry_remain])

# Default (inner) merge: institutions absent from the lookup are dropped.
calls_explode = calls_explode.merge(institution_ctry, left_on='institutions', right_on='Institution')

# Keep rows with a known first country. Take an explicit copy so the column
# assignments below write to an independent frame instead of a slice of
# `calls_explode` (which raised SettingWithCopyWarning).
calls_final = calls_explode[calls_explode['Country1'].notna()].copy()
calls_final['Date'] = pd.to_datetime(calls_final['Date'])
calls_final['year'] = calls_final['Date'].dt.year
calls_final['quarter'] = calls_final['Date'].dt.quarter
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_27468\1114554688.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy calls_final['Date'] = pd.to_datetime(calls_final['Date']) C:\Users\yifeilu\AppData\Local\Temp\ipykernel_27468\1114554688.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy calls_final['year'] = calls_final['Date'].dt.year C:\Users\yifeilu\AppData\Local\Temp\ipykernel_27468\1114554688.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy calls_final['quarter'] = calls_final['Date'].dt.quarter
# Flag an institution as foreign when its first country differs from the
# firm's Alpha-3 code, or when it has a second country that differs from it.
# `assign` returns a new frame, so this never writes into a slice of another
# DataFrame (the item assignment used before raised SettingWithCopyWarning).
calls_final = calls_final.assign(
    foreign=(calls_final['Country1'] != calls_final['Alpha-3 code'])
    | (
        calls_final['Country2'].notna()
        & (calls_final['Country2'] != calls_final['Alpha-3 code'])
    )
)
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_27468\1998467780.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy calls_final['foreign'] = (calls_final['Country1'] != calls_final['Alpha-3 code']) | ((calls_final['Country2'].notna()) & (calls_final['Country2'] != calls_final['Alpha-3 code']))
# Give the country columns their final names: firm country vs. the
# participant institution's first and second country.
calls_final = calls_final.rename(columns={"Alpha-3 code": "firm_ctry",
                                          "Country1": "part_ctry1",
                                          "Country2": "part_ctry2"})

# Remove columns that are not exported ('Date' has already been split into
# year and quarter).
calls_final.drop(['Error code', 'Error Description', 'Date'], axis=1, inplace=True)

# Cast these columns to plain strings for the Stata export. astype(str) turns
# missing values into the text 'nan', so map that back to a real missing value.
calls_final['participants'] = calls_final['participants'].astype(str).replace('nan', np.nan)
calls_final['part_ctry2'] = calls_final['part_ctry2'].astype(str).replace('nan', np.nan)

# 'Alpha-2 code' is not a valid Stata variable name. Rename it explicitly for
# the export (to the same name pandas would otherwise substitute) so the
# InvalidColumnName warning is not raised; `calls_final` itself is unchanged.
# The path uses forward slashes to avoid invalid backslash escapes.
calls_final.rename(columns={"Alpha-2 code": "Alpha_2_code"}).to_stata(
    path_dropbox + "/Conference Call Transcript/calls_part_full_0227.dta",
    version=118,
    write_index=False,
)
C:\Users\yifeilu\AppData\Local\Temp\ipykernel_27468\3662744895.py:1: InvalidColumnName:
Not all pandas column names were valid Stata variable names.
The following replacements have been made:
Alpha-2 code -> Alpha_2_code
If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)
calls_final.to_stata(path_dropbox + "\Conference Call Transcript\calls_part_full_0227.dta", version=118, write_index = False)